#' ---
#' title: "Campaign Communication and Legislative Leadership (PSRM)"
#' subtitle: "02_prepare_bert_data.R"
#' author: "Authors: Stefan Mueller and Naofumi Fujimura"
#' date: "Note: Code compiled successfully on `r format(Sys.time(), '%d %B %Y')`"
#' ---

# load packages
library(dplyr)               # CRAN v1.1.2
library(readr)               # CRAN v2.1.4
library(quanteda)            # CRAN v3.3.1
library(quanteda.textmodels) # CRAN v0.9.6
library(quanteda.textstats)  # CRAN v0.96.3
library(rio)                 # CRAN v0.5.29


# If the code does not run, one or more packages may have been 
# updated, which may result in errors or conflicts. You can solve this issue
# by installing the package version listed above or by using the 
# groundhog package:
# after installing groundhog using install.packages("groundhog")
# change library(name_of_package) to
# groundhog::groundhog.library(name_of_package, date = "2024-01-31")
# Instead of adjusting the library() function for each package, 
# you can adjust them all at once using
# the following syntax:
# groundhog.library("library('pkgA')
#                   library('pkgB')
#                   library('pkgC')", date = "2024-01-31")
# More details are available at: https://groundhogr.com/using/

# record the R version and the attached package versions for this run
sessionInfo()

# read the corpus of candidate manifestos from disk
corp <- readRDS(file = "data_corpus_japmanifestos.rds")

# store each manifesto's document name as a document-level variable; it is
# used below to compare the manifestos before and after segmentation
corp$doc_id <- docnames(corp)

# number of manifestos per election year
table(corp$election_year)

# characters treated as statement delimiters (Latin and Japanese sentence
# punctuation plus dash and bullet symbols), written as a regex character class
pattern_sentence <- "[.?!！•–。●■○・]"

# number of manifestos before segmentation
ndoc(corp)

# cut every manifesto into statements; the delimiter follows the text it
# closes (pattern_position = "after")
corp_statement <- corpus_segment(
    corp,
    pattern = pattern_sentence,
    valuetype = "regex",
    pattern_position = "after"
)

# sanity check: number of distinct manifestos before and after segmentation,
# followed by the manifestos that did not yield any statement
length(unique(corp$doc_id))
length(unique(corp_statement$doc_id))

setdiff(corp$doc_id, corp_statement$doc_id)

# data frame with one row per statement: the statement text plus all
# document-level variables (built before empty statements are dropped below)
dat_statements <- data.frame(
    text = as.character(corp_statement),
    docvars(corp_statement)
)

# tokens per statement, not counting punctuation
ntoken_corp <- corp_statement |>
    tokens(remove_punct = TRUE) |>
    ntoken()

# number of statements before filtering
ndoc(corp_statement)

# keep only statements that contain at least one non-punctuation token
corp_statement <- corpus_subset(corp_statement, subset = ntoken_corp > 0)

# number of statements after filtering
ndoc(corp_statement)

# statement texts as a one-column data frame
texts <- data.frame(text = as.character(corp_statement))

# read the hand-coded sentences used for training the classifier
dat <- read.csv(file = "data_handcoded_sentences.csv", fileEncoding = "utf-8")

# fix the random seed so that the shuffled row order is reproducible
set.seed(235)

# random permutation of the row indices
rows <- sample(nrow(dat))

# shuffle the rows, drop the "Justice Affairs" category, and add zero-based
# numeric labels derived from the factor level codes of policy_area
dat_reshuffled <- dat[rows, ] |>
    filter(policy_area != "Justice Affairs") |>
    mutate(policy_area_num = as.numeric(as.factor(policy_area)) - 1)

# distribution of the numeric labels
table(dat_reshuffled$policy_area_num)

# keep the columns needed for BERT classification; `sentence` is renamed
# to `text`
dat_bert <- dat_reshuffled |>
    dplyr::select(policy_area_num, policy_area, text = sentence)

# number of tokens per sentence
dat_bert$ntoken <- ntoken(tokens(corpus(dat_bert)))

# number of distinct labels and distribution of policy areas
length(unique(dat_bert$policy_area_num))

table(dat_bert$policy_area)

# split sentences for fine-tuning: the first 2000 shuffled sentences are used
# for training, the next 500 for evaluation, and the remainder for testing
n_train <- 2000
n_eval <- 500

# the fixed split sizes require more than n_train + n_eval sentences; with
# fewer rows a range such as 2501:nrow(dat_bert) would count backwards and
# out-of-range indices would add all-NA rows, so stop early instead of
# silently writing corrupt or overlapping splits
stopifnot(nrow(dat_bert) > n_train + n_eval)

train <- dat_bert[seq_len(n_train), ]
eval <- dat_bert[seq(n_train + 1, n_train + n_eval), ]
test <- dat_bert[seq(n_train + n_eval + 1, nrow(dat_bert)), ]

# make sure all areas are included in each data frame (printed for visual
# inspection; compare against the number of labels in dat_bert)
length(unique(test$policy_area_num))
length(unique(train$policy_area_num))
length(unique(eval$policy_area_num))

# label distribution within each split
table(test$policy_area_num)
table(train$policy_area_num)
table(eval$policy_area_num)

# number of sentences in each split
nrow(test)
nrow(train)
nrow(eval)

# save data as CSV
write_csv(train, "data_sentences_train.csv")
write_csv(eval, "data_sentences_eval.csv")
write_csv(test, "data_sentences_test.csv")


# export all statements in corp_statement as a data frame for BERT

# number of statements to be exported
ndoc(corp_statement)

# one row per statement: the statement text plus all document-level variables
dat_statements_all <- data.frame(
    text = as.character(corp_statement),
    docvars(corp_statement)
)

# replace statements that consist of the literal string "NA" with "NAS" --
# presumably so that they are not mistaken for missing values when the CSV
# is read back in
dat_statements_all_no_na <- mutate(
    dat_statements_all,
    text = dplyr::recode(text, "NA" = "NAS")
)

# the recoding must not change the number of rows
nrow(dat_statements_all_no_na)
nrow(dat_statements_all)

nrow(dat_statements_all_no_na)

# number of tokens per statement; doc_id is dropped before building the
# corpus -- presumably because it repeats across statements of the same
# manifesto and would otherwise be picked up as document identifier
# (TODO confirm)
toks <- dat_statements_all_no_na |>
    dplyr::select(-doc_id) |>
    corpus() |>
    ntoken()

# number of rows/statements
nrow(dat_statements_all_no_na)

# write the statements to disk for BERT classification
write_csv(dat_statements_all_no_na, "data_sentences_all.csv")
